From: Joey Hess Date: Wed, 31 May 2023 16:31:14 +0000 (-0400) Subject: build git trees using ContentIdentifier to speed up import X-Git-Tag: archive/raspbian/10.20250416-2+rpi1~1^2~40^2~119 X-Git-Url: https://dgit.raspbian.org/%22http:/www.example.com/cgi/%22https://%22Program/%22http:/www.example.com/cgi/%22https:/%22Program?a=commitdiff_plain;h=72981235204b5065676880be42cf64079efae881;p=git-annex.git build git trees using ContentIdentifier to speed up import This gets the trees built, but it does not use them. Next step will be to remember the tree for next time an import is done, and diff between old and new trees to find the files that have changed. Added --missing to the mktree parameters. That only disables a check, so it's ok to do everywhere mktree is used. It probably also speeds up mktree to disable the check. Note that git fsck does not complain about the resulting tree objects that point to shas that are not in the repository. Even with --strict. A quick benchmark, importing 10000 files, this slowed it down from 2:04.06 to 2:04.28. So it will more than pay for itself. Sponsored-by: Luke Shumaker on Patreon --- diff --git a/Annex/Import.hs b/Annex/Import.hs index 9f9a2d6f50..fccbc5dda4 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -1,6 +1,6 @@ {- git-annex import from remotes - - - Copyright 2019-2021 Joey Hess + - Copyright 2019-2023 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -46,6 +46,7 @@ import Types.KeySource import Messages.Progress import Utility.DataUnits import Utility.Metered +import Utility.Hash (sha1s) import Logs.Export import Logs.Location import Logs.PreferredContent @@ -62,6 +63,7 @@ import qualified Data.Map.Strict as M import qualified Data.Set as S import qualified System.FilePath.Posix.ByteString as Posix import qualified System.FilePath.ByteString as P +import qualified Data.ByteArray.Encoding as BA {- Configures how to build an import tree. -} data ImportTreeConfig @@ -269,15 +271,76 @@ buildImportTrees -> Maybe TopFilePath -> ImportableContentsChunkable Annex (Either Sha Key) -> Annex (History Sha) -buildImportTrees basetree msubdir (ImportableContentsComplete importable) = do +buildImportTrees = buildImportTreesGeneric convertImportTree + +convertImportTree :: Maybe TopFilePath -> [(ImportLocation, Either Sha Key)] -> Annex Tree +convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls + where + mktreeitem (loc, v) = case v of + Right k -> do + relf <- fromRepo $ fromTopFilePath topf + symlink <- calcRepo $ gitAnnexLink relf k + linksha <- hashSymlink symlink + return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha + Left sha -> + return $ TreeItem treepath (fromTreeItemType TreeFile) sha + where + lf = fromImportLocation loc + treepath = asTopFilePath lf + topf = asTopFilePath $ + maybe lf (\sd -> getTopFilePath sd P. lf) msubdir + +{- Builds a history of git trees using ContentIdentifiers. + - + - These are not the final trees that are generated by the import, which + - use Keys. The purpose of these trees is to allow quickly determining + - which files in the import have changed, and which are unchanged, to + - avoid needing to look up the Keys for unchanged ContentIdentifiers. + - When the import has a large number of files, that can be slow. + -} +buildContentIdentifierTree + :: ImportableContentsChunkable Annex (ContentIdentifier, ByteSize) + -> Annex (History Sha) +buildContentIdentifierTree = + buildImportTreesGeneric convertContentIdentifierTree emptyTree Nothing + +{- For speed, and to avoid bloating the repository, the ContentIdentifiers + - are not actually checked into git, instead a sha1 hash is calculated + - internally. + -} +convertContentIdentifierTree + :: Maybe TopFilePath + -> [(ImportLocation, (ContentIdentifier, ByteSize))] + -> Annex Tree +convertContentIdentifierTree _ ls = pure $ treeItemsToTree $ map mktreeitem ls + where + mktreeitem (loc, ((ContentIdentifier cid), _sz)) = + TreeItem p mode sha1 + where + p = asTopFilePath (fromImportLocation loc) + mode = fromTreeItemType TreeFile + -- Note that this hardcodes sha1, even if git has started + -- defaulting to some other checksum method. That should be + -- ok, hopefully. This checksum never needs to be verified + -- by git, which is why this does not bother to prefix the + -- cid with its length, like git would. + sha1 = Ref $ BA.convertToBase BA.Base16 $ sha1s cid + +buildImportTreesGeneric + :: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree) + -> Ref + -> Maybe TopFilePath + -> ImportableContentsChunkable Annex v + -> Annex (History Sha) +buildImportTreesGeneric converttree basetree msubdir (ImportableContentsComplete importable) = do repo <- Annex.gitRepo - withMkTreeHandle repo $ buildImportTrees' basetree msubdir importable -buildImportTrees basetree msubdir importable@(ImportableContentsChunked {}) = do + withMkTreeHandle repo $ buildImportTreesGeneric' converttree basetree msubdir importable +buildImportTreesGeneric converttree basetree msubdir importable@(ImportableContentsChunked {}) = do repo <- Annex.gitRepo withMkTreeHandle repo $ \hdl -> History <$> go hdl - <*> buildImportTreesHistory basetree msubdir + <*> buildImportTreesHistory converttree basetree msubdir (importableHistoryComplete importable) hdl where go hdl = do @@ -291,7 +354,7 @@ buildImportTrees basetree msubdir importable@(ImportableContentsChunked {}) = do let fullprefix = asTopFilePath $ case msubdir of Nothing -> subdir Just d -> getTopFilePath d Posix. subdir - Tree ts <- convertImportTree (Just fullprefix) $ + Tree ts <- converttree (Just fullprefix) $ map (\(p, i) -> (mkImportLocation p, i)) (importableContentsSubTree c) -- Record this subtree before getting next chunk, this @@ -302,24 +365,26 @@ buildImportTrees basetree msubdir importable@(ImportableContentsChunked {}) = do Nothing -> return (Tree (tc:l)) Just c' -> gochunks (tc:l) c' hdl -buildImportTrees' - :: Ref +buildImportTreesGeneric' + :: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree) + -> Ref -> Maybe TopFilePath - -> ImportableContents (Either Sha Key) + -> ImportableContents v -> MkTreeHandle -> Annex (History Sha) -buildImportTrees' basetree msubdir importable hdl = History - <$> buildImportTree basetree msubdir (importableContents importable) hdl - <*> buildImportTreesHistory basetree msubdir (importableHistory importable) hdl +buildImportTreesGeneric' converttree basetree msubdir importable hdl = History + <$> buildImportTree converttree basetree msubdir (importableContents importable) hdl + <*> buildImportTreesHistory converttree basetree msubdir (importableHistory importable) hdl buildImportTree - :: Ref + :: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree) + -> Ref -> Maybe TopFilePath - -> [(ImportLocation, Either Sha Key)] + -> [(ImportLocation, v)] -> MkTreeHandle -> Annex Sha -buildImportTree basetree msubdir ls hdl = do - importtree <- liftIO . recordTree' hdl =<< convertImportTree msubdir ls +buildImportTree converttree basetree msubdir ls hdl = do + importtree <- liftIO . recordTree' hdl =<< converttree msubdir ls graftImportTree basetree msubdir importtree hdl graftImportTree @@ -333,31 +398,15 @@ graftImportTree basetree msubdir tree hdl = case msubdir of Just subdir -> inRepo $ \repo -> graftTree' tree subdir basetree repo hdl -convertImportTree :: Maybe TopFilePath -> [(ImportLocation, Either Sha Key)] -> Annex Tree -convertImportTree msubdir ls = treeItemsToTree <$> mapM mktreeitem ls - where - mktreeitem (loc, v) = case v of - Right k -> do - relf <- fromRepo $ fromTopFilePath topf - symlink <- calcRepo $ gitAnnexLink relf k - linksha <- hashSymlink symlink - return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha - Left sha -> - return $ TreeItem treepath (fromTreeItemType TreeFile) sha - where - lf = fromImportLocation loc - treepath = asTopFilePath lf - topf = asTopFilePath $ - maybe lf (\sd -> getTopFilePath sd P. lf) msubdir - buildImportTreesHistory - :: Ref + :: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree) + -> Ref -> Maybe TopFilePath - -> [ImportableContents (Either Sha Key)] + -> [ImportableContents v] -> MkTreeHandle -> Annex (S.Set (History Sha)) -buildImportTreesHistory basetree msubdir history hdl = S.fromList - <$> mapM (\ic -> buildImportTrees' basetree msubdir ic hdl) history +buildImportTreesHistory converttree basetree msubdir history hdl = S.fromList + <$> mapM (\ic -> buildImportTreesGeneric' converttree basetree msubdir ic hdl) history canImportKeys :: Remote -> Bool -> Bool canImportKeys remote importcontent = @@ -389,6 +438,8 @@ importKeys -> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize) -> Annex (Maybe (ImportableContentsChunkable Annex (Either Sha Key))) importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do + _ts <- buildContentIdentifierTree importablecontents + -- TODO use above unless (canImportKeys remote importcontent) $ giveup "This remote does not support importing without downloading content." -- This map is used to remember content identifiers that diff --git a/Git/Tree.hs b/Git/Tree.hs index 8c034be4cd..c811b111c8 100644 --- a/Git/Tree.hs +++ b/Git/Tree.hs @@ -77,7 +77,7 @@ withMkTreeHandle :: (MonadIO m, MonadMask m) => Repo -> (MkTreeHandle -> m a) -> withMkTreeHandle repo a = bracketIO setup cleanup (a . MkTreeHandle) where setup = gitCoProcessStart False ps repo - ps = [Param "mktree", Param "--batch", Param "-z"] + ps = [Param "mktree", Param "--missing", Param "--batch", Param "-z"] cleanup = CoProcess.stop {- Records a Tree in the Repo, returning its Sha. diff --git a/Utility/Hash.hs b/Utility/Hash.hs index 835409f482..9cba7f6056 100644 --- a/Utility/Hash.hs +++ b/Utility/Hash.hs @@ -10,6 +10,7 @@ module Utility.Hash ( sha1, sha1_context, + sha1s, sha2_224, sha2_224_context, sha2_256, @@ -84,6 +85,9 @@ sha1 = hashlazy sha1_context :: Context SHA1 sha1_context = hashInit +sha1s :: S.ByteString -> Digest SHA1 +sha1s = hash + sha2_224 :: L.ByteString -> Digest SHA224 sha2_224 = hashlazy